{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "import gc\n",
    "import datetime\n",
    "from tqdm import tqdm\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore')\n",
    "\n",
    "import tensorflow as tf\n",
    "from keras.layers import *\n",
    "from keras.models import Model, Sequential, load_model\n",
    "from keras.optimizers import *\n",
    "from keras.callbacks import ModelCheckpoint\n",
    "from keras.activations import *\n",
    "from keras.layers.advanced_activations import *\n",
    "from keras import regularizers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder that holds the competition files; every input path below is built from it.\n",
    "DATA_DIR = '../input/charbusters/ChartbustersParticipantsData/'\n",
    "\n",
    "train = pd.read_csv(DATA_DIR + 'Data_Train.csv')\n",
    "test = pd.read_csv(DATA_DIR + 'Data_Test.csv')\n",
    "sample_sub = pd.read_excel(DATA_DIR + 'Sample_Submission.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(78458, 11) (19615, 10)\n",
      "(98073, 10)\n"
     ]
    }
   ],
   "source": [
    "ID_COL = 'Unique_ID'\n",
    "TARGET_COL = 'Views'\n",
    "print(train.shape, test.shape)\n",
    "\n",
    "# Take the label column out of the training frame and keep it separately.\n",
    "target = train.pop(TARGET_COL)\n",
    "\n",
    "# Stack the training rows on top of the test rows so both get the same features.\n",
    "df = pd.concat([train, test])\n",
    "print(df.shape)\n",
    "df = df.drop(columns='Country')\n",
    "features = [col for col in df.columns if col != ID_COL and col != TARGET_COL]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# nums.fill('na')\n",
    "errors = []\n",
    "def clean_nums(x):\n",
    "    if ',' in x:\n",
    "        x = x.replace(',', '')\n",
    "    if 'K' in x:\n",
    "        x = x.replace('K', '')\n",
    "        x = float(x) * 10**3\n",
    "    elif 'M' in x:\n",
    "        x = x.replace('M', '')\n",
    "        x = float(x) * 10**6\n",
    "    try: return float(x)\n",
    "    except:\n",
    "        errors.append(x)\n",
    "        return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The count columns arrive as text ('1,234', '1.5K', ...); parse each value to a float.\n",
    "num_cols = ['Comments', 'Likes', 'Popularity', 'Followers']\n",
    "for col in num_cols:\n",
    "    df[col] = df[col].map(lambda raw: clean_nums(str(raw)))\n",
    "\n",
    "df['Timestamp'] = pd.to_datetime(df['Timestamp'])\n",
    "\n",
    "# Only the length of the song title is kept; a missing title counts as the\n",
    "# four-character placeholder 'xxxx'.\n",
    "song_names = df['Song_Name'].fillna('xxxx')\n",
    "df = df.drop(columns='Song_Name')\n",
    "df['song_length'] = song_names.map(len)\n",
    "\n",
    "# Replace the artist and genre labels by integer codes (sorted label order).\n",
    "cat_cols = ['Name', 'Genre']\n",
    "for col in cat_cols:\n",
    "    df[col] = pd.factorize(df[col], sort=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_new_columns(name,aggs):\n",
    "    new_cols_list = []\n",
    "    for k in aggs.keys():\n",
    "        for agg in aggs[k]:\n",
    "            if isinstance(agg, str):\n",
    "                new_cols_list.append(name + '_' + k + '_' + agg)\n",
    "            else:\n",
    "                new_cols_list.append(name + '_' + k + agg.__name__)     \n",
    "    return new_cols_list\n",
    "\n",
    "# Per-artist mean / min / max of every count column, joined back onto each row.\n",
    "aggs = {col: ['mean', 'min', 'max'] for col in ['Comments', 'Likes', 'Popularity', 'Followers']}\n",
    "\n",
    "new_columns = get_new_columns('name_', aggs)\n",
    "name_grp = df.groupby('Name').agg(aggs)\n",
    "# Swap the (column, aggregation) labels produced by agg() for the flat names.\n",
    "name_grp.columns = new_columns\n",
    "name_grp = name_grp.reset_index()\n",
    "df = df.merge(name_grp, on='Name', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise ratios of the count columns. Only the first ratio pads its\n",
    "# denominator with 1e-5; the others divide by the raw column.\n",
    "df['likes_per_follower'] = df['Likes'] / (df['Followers'] + 1e-5)\n",
    "\n",
    "ratio_specs = [\n",
    "    ('likes_per_comment', 'Likes', 'Comments'),\n",
    "    ('likes_per_popularity', 'Likes', 'Popularity'),\n",
    "    ('comments_per_follower', 'Comments', 'Followers'),\n",
    "    ('Popularity_per_follower', 'Popularity', 'Followers'),\n",
    "    ('comments_per_popularity', 'Comments', 'Popularity'),\n",
    "]\n",
    "for new_col, numerator, denominator in ratio_specs:\n",
    "    df[new_col] = df[numerator] / df[denominator]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise products of the count columns. The first product adds 1e-5 to the\n",
    "# follower count before multiplying; the others multiply the raw columns.\n",
    "df['likes_prod_follower'] = df['Likes'] * (df['Followers'] + 1e-5)\n",
    "\n",
    "product_specs = [\n",
    "    ('likes_prod_comment', 'Likes', 'Comments'),\n",
    "    ('likes_prod_popularity', 'Likes', 'Popularity'),\n",
    "    ('comments_prod_follower', 'Comments', 'Followers'),\n",
    "    ('Popularity_prod_follower', 'Popularity', 'Followers'),\n",
    "    ('comments_prod_popularity', 'Comments', 'Popularity'),\n",
    "]\n",
    "for new_col, left, right in product_specs:\n",
    "    df[new_col] = df[left] * df[right]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calendar components of the release timestamp.\n",
    "release = df['Timestamp']\n",
    "for part in ['year', 'day', 'weekofyear', 'month', 'dayofweek']:\n",
    "    df[part] = getattr(release.dt, part)\n",
    "\n",
    "# 1 when the weekday number is 5 or 6, else 0.\n",
    "df['weekend'] = (release.dt.weekday >= 5).astype(int)\n",
    "\n",
    "# Days component of the time between the release and the fixed reference date 2019-08-01.\n",
    "df['days_since_release'] = (datetime.datetime(2019, 8, 1) - release).dt.days\n",
    "\n",
    "for part in ['hour', 'minute']:\n",
    "    df[part] = getattr(release.dt, part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Difference in days_since_release between each row and the same artist's\n",
    "# following / preceding row when rows are ordered by Timestamp.\n",
    "temp = df.sort_values(by=['Name', 'Timestamp'])\n",
    "release_age = temp['days_since_release']\n",
    "age_by_artist = temp.groupby('Name')['days_since_release']\n",
    "\n",
    "# Rows without a neighbour in the group get 0; the 'prev' gap has its sign flipped.\n",
    "temp['artist_next_album'] = (release_age - age_by_artist.shift(-1)).fillna(0)\n",
    "temp['artist_prev_album'] = (release_age - age_by_artist.shift(1)).fillna(0) * -1\n",
    "\n",
    "gap_cols = [ID_COL, 'artist_next_album', 'artist_prev_album']\n",
    "df = df.merge(temp[gap_cols], on=ID_COL, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Difference in days_since_release between each row and the following /\n",
    "# preceding row of the same genre when rows are ordered by Timestamp.\n",
    "temp = df.sort_values(by=['Genre', 'Timestamp'])\n",
    "release_age = temp['days_since_release']\n",
    "age_by_genre = temp.groupby('Genre')['days_since_release']\n",
    "\n",
    "# Rows without a neighbour in the group get 0; the 'prev' gap has its sign flipped.\n",
    "temp['genre_next_album'] = (release_age - age_by_genre.shift(-1)).fillna(0)\n",
    "temp['genre_prev_album'] = (release_age - age_by_genre.shift(1)).fillna(0) * -1\n",
    "\n",
    "gap_cols = [ID_COL, 'genre_next_album', 'genre_prev_album']\n",
    "df = df.merge(temp[gap_cols], on=ID_COL, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change in Popularity between each row and the same artist's following /\n",
    "# preceding row when rows are ordered by Timestamp (0 where there is no neighbour).\n",
    "temp = df.sort_values(by=['Name', 'Timestamp'])\n",
    "popularity = temp['Popularity']\n",
    "popularity_by_artist = temp.groupby('Name')['Popularity']\n",
    "\n",
    "temp['Popularity_in_next_album'] = (popularity - popularity_by_artist.shift(-1)).fillna(0)\n",
    "temp['Popularity_in_prev_album'] = (popularity - popularity_by_artist.shift(1)).fillna(0)\n",
    "\n",
    "delta_cols = [ID_COL, 'Popularity_in_next_album', 'Popularity_in_prev_album']\n",
    "df = df.merge(temp[delta_cols], on=ID_COL, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every row, attach the Popularity / Likes / Followers found one row further\n",
    "# down in the same artist's rows (ordered by Timestamp), i.e. groupby shift(-1).\n",
    "# NOTE(review): despite the 'Prev_' prefix, shift(-1) on an ascending sort reads\n",
    "# the artist's later release; the direction is kept unchanged - confirm intent.\n",
    "temp = df.sort_values(by=['Name', 'Timestamp'])\n",
    "source_cols = ['Popularity', 'Likes', 'Followers']\n",
    "prev_cols = ['Prev_' + col for col in source_cols]\n",
    "\n",
    "# Each target is filled from the source column of the same name, so Likes and\n",
    "# Followers cannot end up under each other's label.\n",
    "shifted = temp.groupby('Name')[source_cols].shift(-1)\n",
    "for src, dst in zip(source_cols, prev_cols):\n",
    "    temp[dst] = shifted[src]\n",
    "\n",
    "df = pd.merge(df, temp[[ID_COL] + prev_cols], on=ID_COL, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rolling '30D' mean of Likes / Popularity / Followers within each artist.\n",
    "rolling_cols = ['Likes', 'Popularity', 'Followers']\n",
    "temp = df.sort_values(by=['Name', 'Timestamp']).set_index('Timestamp')\n",
    "res = temp.groupby('Name').rolling('30D')[rolling_cols].mean()\n",
    "res = res.reset_index(drop=True).add_suffix('_Name_rolling')\n",
    "\n",
    "# IDs are re-attached by position: this relies on the grouped result having the\n",
    "# same row order as temp (sorted by Name, then Timestamp).\n",
    "res[ID_COL] = temp[ID_COL].values\n",
    "df = df.merge(res, on=ID_COL, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One integer code per distinct 'year_month' string.\n",
    "year_month_key = df['year'].astype('str') + '_' + df['month'].astype('str')\n",
    "df['year_month'] = pd.factorize(year_month_key)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Broadcast the per-year mean and max of each count column back onto every row.\n",
    "for c in ['Comments', 'Likes', 'Popularity', 'Followers']:\n",
    "    by_year = df.groupby('year')[c]\n",
    "    yearly_mean, yearly_max = by_year.mean(), by_year.max()\n",
    "    df[c + '_yearwise_mean'] = df['year'].map(yearly_mean)\n",
    "    df[c + '_yearwise_max'] = df['year'].map(yearly_max)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Elapsed time in seconds between the release and the fixed reference date 2019-08-01.\n",
    "# total_seconds() covers the whole interval; the .dt.seconds attribute would only\n",
    "# give the seconds component that remains after whole days are taken out.\n",
    "df['seconds_since_release'] = (datetime.datetime(2019, 8, 1) - df['Timestamp']).dt.total_seconds()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns treated as categorical from here on.\n",
    "cat_cols = [\n",
    "    'Name',\n",
    "    'Genre',\n",
    "    'year_month',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "65\n",
      "62\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "likes_per_comment          2296\n",
       "likes_per_popularity       2323\n",
       "comments_per_popularity    7100\n",
       "Prev_Popularity            1219\n",
       "Prev_Likes                 1219\n",
       "Prev_Followers             1219\n",
       "dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# For every categorical column, add the number of rows sharing the row's value.\n",
    "freq_cols = [c + '_freq' for c in cat_cols]\n",
    "for cat_col, freq_col in zip(cat_cols, freq_cols):\n",
    "    df[freq_col] = df[cat_col].map(df[cat_col].value_counts())\n",
    "\n",
    "excluded = {ID_COL, TARGET_COL, 'Timestamp'}\n",
    "features = [c for c in df.columns if c not in excluded]\n",
    "print(len(features))\n",
    "\n",
    "num_cols = [c for c in features if c not in cat_cols]\n",
    "print(len(num_cols))\n",
    "\n",
    "# Numeric columns that contain missing values, with their NaN counts.\n",
    "null_counts = df[num_cols].isnull().sum()\n",
    "null_counts[null_counts > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_inf(x):\n",
    "    x[x == np.inf] = 0\n",
    "    return x\n",
    "# Fill NaNs with the column mean, then pass every column through remove_inf.\n",
    "# NOTE(review): a column that contains +inf has an infinite mean, so its NaNs are\n",
    "# filled with inf here and only become 0 in the second step - confirm this is intended.\n",
    "mean_filled = df[num_cols].apply(lambda col: col.fillna(col.mean()))\n",
    "df[num_cols] = mean_filled.apply(remove_inf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Series([], dtype: int64)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same mean-fill and remove_inf steps as the previous cell, applied once more.\n",
    "df[num_cols] = (\n",
    "    df[num_cols]\n",
    "    .apply(lambda col: col.fillna(col.mean()))\n",
    "    .apply(remove_inf)\n",
    ")\n",
    "\n",
    "# Numeric columns that still contain NaNs (an empty result means none are left).\n",
    "remaining_nulls = df[num_cols].isnull().sum()\n",
    "remaining_nulls[remaining_nulls > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "60\n"
     ]
    }
   ],
   "source": [
    "# Add a log1p-transformed copy ('<column>_log') of every numeric column that\n",
    "# has no negative values.\n",
    "to_do_log_num_cols = [c for c in num_cols if not (df[c] < 0).any()]\n",
    "log_num_cols = [c + '_log' for c in to_do_log_num_cols]\n",
    "print(len(log_num_cols))\n",
    "\n",
    "for src, dst in zip(to_do_log_num_cols, log_num_cols):\n",
    "    df[dst] = np.log1p(df[src])\n",
    "\n",
    "num_cols = list(set(num_cols + log_num_cols))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Series([], dtype: int64)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def scale(x):\n",
    "    return (x - x.mean())/(1e-5 + x.std())\n",
    "# Standardise every numeric feature, then list any column left with NaNs.\n",
    "df[num_cols] = df[num_cols].apply(scale)\n",
    "nan_counts = df[num_cols].isnull().sum()\n",
    "nan_counts[nan_counts > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "122\n"
     ]
    }
   ],
   "source": [
    "# Undo the earlier concat by position: the first len(train) rows of df are the\n",
    "# training rows, the remainder are the test rows.\n",
    "n_train = train.shape[0]\n",
    "train = df.iloc[:n_train]\n",
    "test = df.iloc[n_train:].reset_index(drop=True)\n",
    "\n",
    "cat_cols = ['Name', 'Genre', 'year_month']\n",
    "num_cols = list(set(num_cols))\n",
    "print(len(num_cols))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Name: 1209 values\n",
      "Genre: 21 values\n",
      "year_month: 195 values\n"
     ]
    }
   ],
   "source": [
    "# Categorical columns are the ones fed to embedding layers; the numeric\n",
    "# columns form the single dense input block.\n",
    "embed_cols = cat_cols\n",
    "non_embed_cols = num_cols\n",
    "\n",
    "# Number of distinct values of each embedded column within the training rows.\n",
    "nunique_dict = {c: train[c].nunique() for c in embed_cols}\n",
    "for c, n_values in nunique_dict.items():\n",
    "    print(c + ': %d values' % n_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(125, (98073, 127))"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Model input columns: categorical plus numeric, de-duplicated through a set\n",
    "# (so the resulting order is arbitrary).\n",
    "all_model_cols = embed_cols + non_embed_cols\n",
    "features = list(set(all_model_cols))\n",
    "(len(features), df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rmse(y_true, y_pred):\n",
    "    \"\"\"Root-mean-squared error computed with Keras backend ops.\"\"\"\n",
    "    squared_error = K.square(y_pred - y_true)\n",
    "    return K.sqrt(K.mean(squared_error))\n",
    "\n",
    "def np_rmse(y_true, y_pred):\n",
    "    \"\"\"Root-mean-squared error: square root of scikit-learn's mean_squared_error.\"\"\"\n",
    "    mse = mean_squared_error(y_true, y_pred)\n",
    "    return np.sqrt(mse)\n",
    "\n",
    "\n",
    "class PredictionCallback(tf.keras.callbacks.Callback):\n",
    "    \"\"\"Per-epoch validation scoring with best-model checkpointing and learning-rate decay.\n",
    "\n",
    "    After every epoch the validation inputs are predicted, scored with the\n",
    "    supplied metric and compared with the scores of the earlier epochs. An\n",
    "    improving epoch saves the model to ``file_path``; ``max_patience``\n",
    "    consecutive non-improving epochs multiply the optimizer learning rate by\n",
    "    ``lr_decay_rate``.\n",
    "\n",
    "    metric_info: (display name, callable(y_true, y_pred) -> score) pair.\n",
    "    mode: 'min' when a lower score is better, 'max' when a higher one is.\n",
    "    lr_decay_rate: factor applied to the learning rate when patience runs out.\n",
    "    max_patience: consecutive non-improving epochs tolerated before the decay.\n",
    "    inp_size: number of input arrays at the front of ``self.validation_data``;\n",
    "        the target array is read from the position right after them.\n",
    "    file_path: destination of the saved model.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, metric_info, mode = 'min', lr_decay_rate = 1, max_patience = 2, inp_size=1, file_path = 'best_model.hdf5'):\n",
    "        super().__init__()\n",
    "        self.metric_name = metric_info[0]\n",
    "        self.metric = metric_info[1]\n",
    "        self.mode = mode\n",
    "        self.results = []  # score of every finished epoch, in order\n",
    "        self.max_patience = max_patience\n",
    "        self.patience = 0  # consecutive epochs without improvement so far\n",
    "        self.lr_decay_rate = lr_decay_rate\n",
    "        self.file_path = file_path\n",
    "        self.inp_size = inp_size\n",
    "\n",
    "    def on_epoch_end(self, epoch, logs=None, **kwargs):\n",
    "        \"\"\"Score the validation data, then checkpoint or update patience / learning rate.\"\"\"\n",
    "        y_pred = self.model.predict(self.validation_data[0: self.inp_size])\n",
    "        y_true = self.validation_data[self.inp_size]\n",
    "\n",
    "        res = self.metric(y_true, y_pred)\n",
    "\n",
    "        # With no earlier score there is nothing to compare against: always save.\n",
    "        if epoch == 0 or not self.results:\n",
    "            print('->' * 22 + ' EPOCH : {}  *** {} *** : {}'.format(epoch + 1, self.metric_name, res))\n",
    "            self.model.save(self.file_path)\n",
    "        else:\n",
    "            best = min(self.results + [res]) if self.mode == 'min' else max(self.results + [res])\n",
    "            print('->' * 22 + ' EPOCH : {}  *** {} *** : {} *** BEST *** : {}'.format(epoch + 1, self.metric_name, res, best))\n",
    "            no_improvement = (\n",
    "                (self.mode == 'min' and res >= min(self.results))\n",
    "                or (self.mode == 'max' and res <= max(self.results))\n",
    "            )\n",
    "            if no_improvement:\n",
    "                self.patience += 1\n",
    "\n",
    "                if self.patience >= self.max_patience:\n",
    "                    lr = self.model.optimizer.lr\n",
    "                    new_lr = K.get_value(lr) * self.lr_decay_rate\n",
    "                    K.set_value(lr, new_lr)\n",
    "                    print(\"\\nMetric did not improve for {} iterations. Changing learning rate to: {}\\n\".format(self.max_patience, new_lr))\n",
    "                    self.patience = 0\n",
    "            else:\n",
    "                # An improving epoch ends the current streak, so that the decay is\n",
    "                # only triggered by consecutive epochs without improvement.\n",
    "                self.patience = 0\n",
    "                self.model.save(self.file_path)\n",
    "\n",
    "        self.results.append(res)\n",
    "        print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_embedding_network(categorical_vars, max_embed_size=50, n_hidden_layers=5, hidden_units=256*9, dropout_rate=0.27, learning_rate=1e-3):\n",
    "    \"\"\"Build and compile a feed-forward regressor with one embedding per categorical input.\n",
    "\n",
    "    categorical_vars: names of the categorical columns. Each one gets its own\n",
    "        Input of shape (1,) and an Embedding whose vocabulary size is read from\n",
    "        the module-level ``nunique_dict``.\n",
    "    max_embed_size: cap on the embedding width, applied before the floor of 10.\n",
    "    n_hidden_layers: number of Dense -> ReLU -> Dropout blocks.\n",
    "    hidden_units: width of every hidden Dense layer.\n",
    "    dropout_rate: rate of the Dropout layer in every block.\n",
    "    learning_rate: learning rate passed to the Nadam optimizer.\n",
    "\n",
    "    Returns a compiled Model with ``len(categorical_vars) + 1`` inputs (the\n",
    "    categorical inputs in the given order, then one input holding\n",
    "    ``len(non_embed_cols)`` numeric features) and a single linear output,\n",
    "    using ``rmse`` as the loss.\n",
    "    \"\"\"\n",
    "    inputs = []\n",
    "    branches = []\n",
    "\n",
    "    for categorical_var in categorical_vars:\n",
    "        no_of_unique_cat = nunique_dict[categorical_var]\n",
    "        # Half the number of categories, capped at max_embed_size, never below 10.\n",
    "        embedding_size = int(max(10, min(np.ceil(no_of_unique_cat / 2), max_embed_size)))\n",
    "        single_input = Input(shape=(1,))\n",
    "        embedding = Embedding(no_of_unique_cat, embedding_size, input_length=1, trainable=True)(single_input)\n",
    "        embedding = Reshape(target_shape=(embedding_size,))(embedding)\n",
    "        inputs.append(single_input)\n",
    "        branches.append(embedding)\n",
    "\n",
    "    # The numeric features are concatenated with the embeddings as they are.\n",
    "    # NOTE(review): a Dense(32) projection of this input used to be created here\n",
    "    # without being connected to the model; it was dropped as dead code - confirm\n",
    "    # that feeding the raw numeric block is what is wanted.\n",
    "    input_numeric = Input(shape=(len(non_embed_cols),))\n",
    "    inputs.append(input_numeric)\n",
    "    branches.append(input_numeric)\n",
    "\n",
    "    x = Concatenate()(branches)\n",
    "    for _layer in range(n_hidden_layers):\n",
    "        x = Dense(hidden_units)(x)\n",
    "        x = ReLU()(x)\n",
    "        x = Dropout(dropout_rate)(x)\n",
    "    output = Dense(1, activation='linear')(x)\n",
    "\n",
    "    model = Model(inputs, output)\n",
    "    model.compile(loss=rmse, optimizer=Nadam(lr=learning_rate))\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°0\n",
      "fold n°1\n",
      "fold n°2\n",
      "Train on 66731 samples, validate on 11727 samples\n",
      "Epoch 1/25\n",
      "66731/66731 [==============================] - 11s 171us/step - loss: 951537.2310 - val_loss: 472387.4309\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 1  *** RMSE *** : 662757.0538417881\n",
      "\n",
      "Epoch 2/25\n",
      "66731/66731 [==============================] - 10s 155us/step - loss: 626512.2151 - val_loss: 398903.0004\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 2  *** RMSE *** : 589426.3334266266 *** BEST *** : 589426.3334266266\n",
      "\n",
      "Epoch 3/25\n",
      "66731/66731 [==============================] - 10s 150us/step - loss: 595000.1378 - val_loss: 549369.8525\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 3  *** RMSE *** : 764489.5944089866 *** BEST *** : 589426.3334266266\n",
      "\n",
      "Epoch 4/25\n",
      "66731/66731 [==============================] - 10s 150us/step - loss: 548528.1185 - val_loss: 412999.6610\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 4  *** RMSE *** : 605868.0387239312 *** BEST *** : 589426.3334266266\n",
      "\n",
      "Epoch 5/25\n",
      "66731/66731 [==============================] - 10s 150us/step - loss: 525244.8273 - val_loss: 382405.6189\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 5  *** RMSE *** : 553484.6254531835 *** BEST *** : 553484.6254531835\n",
      "\n",
      "Epoch 6/25\n",
      "66731/66731 [==============================] - 10s 150us/step - loss: 515462.3189 - val_loss: 350234.5620\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 6  *** RMSE *** : 549183.667323038 *** BEST *** : 549183.667323038\n",
      "\n",
      "Epoch 7/25\n",
      "66731/66731 [==============================] - 10s 156us/step - loss: 504518.4537 - val_loss: 467765.5682\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 7  *** RMSE *** : 616467.7797778473 *** BEST *** : 549183.667323038\n",
      "\n",
      "Metric did not improve for 3 iterations. Changing learning rate to: 0.0007000000332482159\n",
      "\n",
      "\n",
      "Epoch 8/25\n",
      "66731/66731 [==============================] - 10s 152us/step - loss: 455574.5513 - val_loss: 312472.1538\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 8  *** RMSE *** : 447192.39691035735 *** BEST *** : 447192.39691035735\n",
      "\n",
      "Epoch 9/25\n",
      "66731/66731 [==============================] - 10s 150us/step - loss: 445978.5030 - val_loss: 490899.2792\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 9  *** RMSE *** : 801239.3787345117 *** BEST *** : 447192.39691035735\n",
      "\n",
      "Epoch 10/25\n",
      "66731/66731 [==============================] - 10s 150us/step - loss: 432018.0409 - val_loss: 350699.4672\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 10  *** RMSE *** : 498982.5953952272 *** BEST *** : 447192.39691035735\n",
      "\n",
      "Epoch 11/25\n",
      "66731/66731 [==============================] - 10s 149us/step - loss: 425125.4041 - val_loss: 298687.5648\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 11  *** RMSE *** : 470725.85380035714 *** BEST *** : 447192.39691035735\n",
      "\n",
      "Metric did not improve for 3 iterations. Changing learning rate to: 0.0004900000232737511\n",
      "\n",
      "\n",
      "Epoch 12/25\n",
      "66731/66731 [==============================] - 10s 152us/step - loss: 377789.2655 - val_loss: 285622.3645\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 12  *** RMSE *** : 416677.1341433743 *** BEST *** : 416677.1341433743\n",
      "\n",
      "Epoch 13/25\n",
      "66731/66731 [==============================] - 10s 155us/step - loss: 391616.1822 - val_loss: 344945.4448\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 13  *** RMSE *** : 455490.60842001834 *** BEST *** : 416677.1341433743\n",
      "\n",
      "Epoch 14/25\n",
      "66731/66731 [==============================] - 10s 150us/step - loss: 399856.8078 - val_loss: 281249.6913\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 14  *** RMSE *** : 409647.727984421 *** BEST *** : 409647.727984421\n",
      "\n",
      "Epoch 15/25\n",
      "66731/66731 [==============================] - 10s 150us/step - loss: 385786.7462 - val_loss: 329655.5859\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 15  *** RMSE *** : 469537.20283493196 *** BEST *** : 409647.727984421\n",
      "\n",
      "Epoch 16/25\n",
      "66731/66731 [==============================] - 10s 151us/step - loss: 377041.8217 - val_loss: 324781.9153\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 16  *** RMSE *** : 452199.7813672665 *** BEST *** : 409647.727984421\n",
      "\n",
      "Metric did not improve for 3 iterations. Changing learning rate to: 0.00034300000406801696\n",
      "\n",
      "\n",
      "Epoch 17/25\n",
      "66731/66731 [==============================] - 10s 149us/step - loss: 347685.7166 - val_loss: 334782.6034\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 17  *** RMSE *** : 431069.4601080937 *** BEST *** : 409647.727984421\n",
      "\n",
      "Epoch 18/25\n",
      "66731/66731 [==============================] - 10s 153us/step - loss: 350298.1959 - val_loss: 301355.0331\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 18  *** RMSE *** : 473927.9245912565 *** BEST *** : 409647.727984421\n",
      "\n",
      "Epoch 19/25\n",
      "66731/66731 [==============================] - 10s 152us/step - loss: 349565.3264 - val_loss: 272138.7460\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 19  *** RMSE *** : 403645.55425543385 *** BEST *** : 403645.55425543385\n",
      "\n",
      "Epoch 20/25\n",
      "66731/66731 [==============================] - 10s 149us/step - loss: 338585.0039 - val_loss: 417269.5140\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 20  *** RMSE *** : 637232.1541487213 *** BEST *** : 403645.55425543385\n",
      "\n",
      "Metric did not improve for 3 iterations. Changing learning rate to: 0.00024009999469853935\n",
      "\n",
      "\n",
      "Epoch 21/25\n",
      "66731/66731 [==============================] - 10s 151us/step - loss: 334122.6973 - val_loss: 263208.6277\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 21  *** RMSE *** : 379582.7725031597 *** BEST *** : 379582.7725031597\n",
      "\n",
      "Epoch 22/25\n",
      "66731/66731 [==============================] - 10s 152us/step - loss: 315699.0382 - val_loss: 290666.3906\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 22  *** RMSE *** : 414198.1862638511 *** BEST *** : 379582.7725031597\n",
      "\n",
      "Epoch 23/25\n",
      "66731/66731 [==============================] - 10s 152us/step - loss: 322004.4420 - val_loss: 293717.9430\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 23  *** RMSE *** : 411609.5986928519 *** BEST *** : 379582.7725031597\n",
      "\n",
      "Epoch 24/25\n",
      "66731/66731 [==============================] - 10s 155us/step - loss: 318272.6059 - val_loss: 292926.9065\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 24  *** RMSE *** : 456422.74723022274 *** BEST *** : 379582.7725031597\n",
      "\n",
      "Metric did not improve for 3 iterations. Changing learning rate to: 0.00016806999628897755\n",
      "\n",
      "\n",
      "Epoch 25/25\n",
      "66731/66731 [==============================] - 10s 149us/step - loss: 319719.9662 - val_loss: 267782.6925\n",
      "->->->->->->->->->->->->->->->->->->->->->-> EPOCH : 25  *** RMSE *** : 416448.58446696657 *** BEST *** : 379582.7725031597\n",
      "\n",
      "Fold score: 379582.70857\n",
      "0\n"
     ]
    }
   ],
   "source": [
    "# Stratification label: one integer per distinct Name + year + Genre string.\n",
    "fold_split_col = pd.Series(pd.factorize(train['Name'].astype('str') + train['year'].astype('str') + train['Genre'].astype('str'))[0])\n",
    "max_iter = 7\n",
    "# random_state is not passed: without shuffle=True it does not influence the\n",
    "# split, and recent scikit-learn releases reject it when shuffle is False.\n",
    "folds = StratifiedKFold(n_splits=max_iter)\n",
    "oof = np.zeros(len(train))\n",
    "predictions = np.zeros(len(test))\n",
    "file_path = 'best_model.hdf5'\n",
    "\n",
    "\n",
    "def encode_inputs(frame, val_maps):\n",
    "    \"\"\"Return the model inputs for ``frame``: one code array per embedded column, then the numeric block.\"\"\"\n",
    "    arrays = [frame[c].map(val_maps[c]).values for c in embed_cols]\n",
    "    arrays.append(frame[non_embed_cols].values)\n",
    "    return arrays\n",
    "\n",
    "\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, fold_split_col)):\n",
    "    print(\"fold n°{}\".format(fold_))\n",
    "    # Only fold 2 is trained: folds 0 and 1 are skipped and the loop stops after one fit.\n",
    "    if fold_ < 2:\n",
    "        continue\n",
    "    X_trn, y_trn = train.loc[trn_idx, features], target[trn_idx]\n",
    "    X_val, y_val = train.loc[val_idx, features], target[val_idx]\n",
    "\n",
    "    # Map every category value seen in the training fold to its position among\n",
    "    # the sorted unique values, which is the index fed to the embedding.\n",
    "    # NOTE(review): values absent from this training fold map to NaN in the\n",
    "    # validation / test inputs - confirm the Embedding layers handle that or add\n",
    "    # an explicit code for unknown values.\n",
    "    val_maps = {}\n",
    "    for c in embed_cols:\n",
    "        seen_values = np.unique(X_trn[c])\n",
    "        val_maps[c] = pd.Series(range(len(seen_values)), index=seen_values)\n",
    "\n",
    "    proc_X_trn = encode_inputs(X_trn, val_maps)\n",
    "    proc_X_val = encode_inputs(X_val, val_maps)\n",
    "    proc_X_test = encode_inputs(test, val_maps)\n",
    "\n",
    "    NN = build_embedding_network(embed_cols, 10)\n",
    "\n",
    "    pcb = PredictionCallback(('RMSE', np_rmse), inp_size=len(proc_X_val), file_path = file_path, max_patience = 3, lr_decay_rate = 0.7)\n",
    "    callbacks_list = [pcb]\n",
    "\n",
    "    history = NN.fit(proc_X_trn, y_trn, batch_size=100, epochs=25, validation_data = [proc_X_val, y_val], callbacks=callbacks_list)\n",
    "    # Reload the checkpoint written by the callback before predicting.\n",
    "    NN = load_model(file_path,  custom_objects={'rmse':rmse})\n",
    "\n",
    "    oof[val_idx] = NN.predict(proc_X_val, 256*64)[:,0]\n",
    "    print(\"Fold score: {:<8.5f}\".format(np.sqrt(mean_squared_error(y_val, oof[val_idx]))))\n",
    "\n",
    "    current_pred = NN.predict(proc_X_test, 256*64)[:,0]\n",
    "    # Report the number of negative predictions, then floor them at zero.\n",
    "    print((current_pred < 0).sum())\n",
    "    current_pred[current_pred < 0] = 0\n",
    "\n",
    "    sub_df = pd.DataFrame()\n",
    "    sub_df[ID_COL] = test[ID_COL]\n",
    "    sub_df[TARGET_COL] = current_pred\n",
    "    sub_df.to_excel('nn_ ' + str(fold_) + '.xlsx', index=False)\n",
    "\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "# Report how many test predictions are negative, then set those to zero in place.\n",
    "negative_mask = current_pred < 0\n",
    "print(negative_mask.sum())\n",
    "current_pred[negative_mask] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold score: 379582.70857\n"
     ]
    }
   ],
   "source": [
    "# Validation RMSE of the trained fold, recomputed from the out-of-fold predictions.\n",
    "fold_mse = mean_squared_error(y_val, oof[val_idx])\n",
    "print(\"Fold score: {:<8.5f}\".format(np.sqrt(fold_mse)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unique_ID</th>\n",
       "      <th>Views</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>562546</td>\n",
       "      <td>180370.187500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>907584</td>\n",
       "      <td>31119.177734</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>213013</td>\n",
       "      <td>12470.263672</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>340312</td>\n",
       "      <td>29852.271484</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>41854</td>\n",
       "      <td>16728.867188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1031846</td>\n",
       "      <td>5861.603027</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>627446</td>\n",
       "      <td>4253.577637</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1509777</td>\n",
       "      <td>19816.429688</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1270147</td>\n",
       "      <td>3305.786621</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1486926</td>\n",
       "      <td>36350.460938</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unique_ID          Views\n",
       "0     562546  180370.187500\n",
       "1     907584   31119.177734\n",
       "2     213013   12470.263672\n",
       "3     340312   29852.271484\n",
       "4      41854   16728.867188\n",
       "5    1031846    5861.603027\n",
       "6     627446    4253.577637\n",
       "7    1509777   19816.429688\n",
       "8    1270147    3305.786621\n",
       "9    1486926   36350.460938"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first ten rows of the submission frame built for the trained fold.\n",
    "sub_df.head(10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
